library(tidyverse)

data <- read.csv("data_full.csv")

#### Data cleaning on Qualtrics export ####

library(lubridate)

data <- data[-c(1,2),] # first 2 rows are questions themselves

data$StartDate <- ymd_hms(data$StartDate, tz = "America/Denver")
data$EndDate <- ymd_hms(data$EndDate, tz = "America/Denver")

dataEndType <- data %>% 
  mutate(end_type = case_when(end_type == "finished" ~ "finished",
                              end_type == "failed_screener" ~ "failed_screener",
                              end_type == "" ~ "dropout"))

library(naniar)

#### Cases with != finished failed the attention check or never completed the survey ####

data <- data %>% 
  filter(end_type == "finished") %>% 
  select(ResponseId, megye, Q93:Q111, Q160_6, Q159, Q158, Q4, Q5, Q144, Q143, Q141, Q142, Q150_DO, Q154:Q157, LocationLatitude, LocationLongitude) %>% 
  replace_with_na_all(condition = ~.x == "")

# create a single variable for respondents' hometown: 

data$settlement <- coalesce(data$Q93, data$Q94, data$Q95, data$Q96, data$Q97, data$Q98, data$Q99, data$Q100, data$Q101, data$Q102, data$Q103, data$Q104, data$Q105, data$Q106, data$Q107, data$Q108, data$Q109, data$Q110, data$Q111) 

# almost perfect: remaining NAs are for Budapest, as it's measured on the county-level

data <- data %>% 
  mutate(settlement = case_when(is.na(settlement) ~ "Budapest",
                                TRUE ~ as.character(.$settlement)))

data <- data %>% 
  select(-c(Q93:Q111)) %>% 
  mutate(treatment = case_when(is.na(Q150_DO) == TRUE ~ 1,
                               is.na(Q150_DO) == FALSE ~ 0)) # when no random settlement is displayed, displayed == respondents' hometown

# Recoding Likert scale value labels to numeric: 

data <- data %>% 
  mutate(Q154 = recode(Q154, "Nagyon valószínű" = 5, "Inkább valószínű" = 4, "Is-Is" = 3, "Inkább valószínűtlen" = 2, "Kimondottan valószínűtlen" = 1, "NA" = NA_real_),
         Q144 = recode(Q144, "Nagyon valószínű" = 5, "Inkább valószínű" = 4, "Is-Is" = 3, "Inkább valószínűtlen" = 2, "Kimondottan valószínűtlen" = 1, "NA" = NA_real_),
         Q155 = recode(Q155, "Nagyon valószínű" = 5, "Inkább valószínű" = 4, "Is-Is" = 3, "Inkább valószínűtlen" = 2, "Kimondottan valószínűtlen" = 1, "NA" = NA_real_),
         Q143 = recode(Q143, "Nagyon valószínű" = 5, "Inkább valószínű" = 4, "Is-Is" = 3, "Inkább valószínűtlen" = 2, "Kimondottan valószínűtlen" = 1, "NA" = NA_real_),
         Q141 = recode(Q141, "Nagyon valószínű" = 5, "Inkább valószínű" = 4, "Is-Is" = 3, "Inkább valószínűtlen" = 2, "Kimondottan valószínűtlen" = 1, "NA" = NA_real_),
         Q156 = recode(Q156, "Nagyon valószínű" = 5, "Inkább valószínű" = 4, "Is-Is" = 3, "Inkább valószínűtlen" = 2, "Kimondottan valószínűtlen" = 1, "NA" = NA_real_),
         Q4 = recode(Q4, "Nagyon liberális" = 1, "Inkább liberális" = 2, "Se nem liberális, se nem konzervatív" = 3, "Inkább konzervatív" = 4, "Nagyon konzervatív" = 5),
         Q142 = recode(Q142, "Nagyon liberális" = 1, "Inkább liberális" = 2, "Se nem liberális, se nem konzervatív" = 3, "Inkább konzervatív" = 4, "Nagyon konzervatív" = 5),
         Q157 = recode(Q157, "Nagyon liberális" = 1, "Inkább liberális" = 2, "Se nem liberális, se nem konzervatív" = 3, "Inkább konzervatív" = 4, "Nagyon konzervatív" = 5)) %>% 
  mutate(vote = coalesce(Q144, Q154),
         pork = coalesce(Q155, Q143),
         clientelism = coalesce(Q141, Q156),
         ideology = coalesce(Q142, Q157))

#### Analysis ####

fit <- lm(clientelism ~ treatment, data = data)
summary(fit)

fit2 <- lm(pork ~ treatment, data = data)
summary(fit2)

fit3 <- lm(vote ~ treatment, data = data)
summary(fit3)

fit4 <- lm(ideology ~ Q4 + treatment + Q4*treatment, data = data)
summary(fit4)

#### Visualizing Results ####

library(estimatr)
library(ggplot2)

library(dotwhisker)
library(broom)
library(ggthemes)

library(tikzDevice)
tikz(file = "plot_vignette.tex", width = 5, height = 5)

dwplot(list(fit3, fit, fit2), dot_args = list(size = 2.5)) %>% 
  relabel_predictors(c(treatment_client = "Clientelism",
                       treatment_pork = "Pork barrel")) +
  theme_bw() + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        strip.background = element_rect(fill="white"), axis.text.y=element_blank(), legend.title = element_blank(), legend.position="bottom", legend.box = "horizontal", legend.direction = "horizontal") +
  guides(colour = guide_legend(nrow = 2)) + xlab("Coefficient Estimate") + ylab("") +
  geom_vline(xintercept = 0, colour = "grey", linetype = 2) +
  scale_colour_manual(values = c("#4daf4a", "#ca0020", "#0571b0")) +
  scale_color_hue(labels = c("Voting", "Clientelism", "Pork barrel"))

print(plot)
dev.off()

#### Geocoding settlements ####

library(ggmap)

data <- data %>% 
  mutate(Q150_DO = case_when(is.na(Q150_DO) ~ as.character(.$settlement),
                             TRUE ~ as.character(.$Q150_DO)))

displayed_settlements <- data %>% 
  select(Q150_DO) %>% 
  unique()

register_google(key = "Your Google API credentials here")

library(httr)
set_config(config(ssl_verifypeer = 0L))

displayed_geo <- mutate_geocode(displayed_settlements, Q150_DO)

hometowns_survey_takers <- data %>% 
  select(settlement) %>% 
  unique()

hometowns_geo <- mutate_geocode(hometowns_survey_takers, settlement)

hometowns_geo_corr <- hometowns_geo %>% 
  filter(lon >= 16) %>% 
  filter(lon <= 23) %>% 
  rename(surveyedLon = lon,
         surveyedLat = lat)

write.csv2(hometowns_geo_corr, "hometowns_geo_corr.csv")
  
displayed_geo_corr <- displayed_geo %>% 
  unique() %>% 
  filter(lon >= 16) %>% 
  filter(lon <= 23) %>% 
  rename(dispLon = lon,
         dispLat = lat)

write.csv2(displayed_geo_corr, "displayed_geo_corr.csv")

# Percentage of erroneously assigned coordinates: 

(nrow(hometowns_geo)-nrow(hometowns_geo_corr))/nrow(hometowns_geo)*100 # 7.167
(nrow(displayed_geo)-nrow(displayed_geo_corr))/nrow(displayed_geo)*100 # 7.299

data <- data %>% 
  full_join(hometowns_geo_corr, by = "settlement") %>% 
  full_join(displayed_geo_corr, by = "Q150_DO")

library(geodist)

data$distance <- geodist_vec(x1 = data$surveyedLon, y1 = data$surveyedLat, x2 = data$dispLon, y2 = data$dispLat, paired = TRUE, measure = "haversine")

options(scipen = 999)

data <- data %>% 
  mutate(distance_km = distance/1000)

fitDist <- lm(vote ~ distance_km, data = data)
summary(fitDist)

fitDistPork <- lm(pork ~ distance_km, data = data)
summary(fitDistPork)

fitDistClient<- lm(clientelism ~ distance_km, data = data)
summary(fitDistClient)

tikz(file = "plot_distance.tex", width = 5, height = 5)

dwplot(list(fitDist, fitDistPork, fitDistClient), dot_args = list(size = 2.5)) %>% 
  relabel_predictors(c(treatment_client = "Clientelism",
                       treatment_pork = "Pork barrel")) +
  theme_bw() + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        strip.background = element_rect(fill="white"), axis.text.y=element_blank(), legend.title = element_blank(), legend.position="bottom", legend.box = "horizontal", legend.direction = "horizontal") +
  guides(colour = guide_legend(nrow = 2)) + xlab("Coefficient Estimate") + ylab("") +
  geom_vline(xintercept = 0, colour = "grey", linetype = 2) +
  scale_colour_manual(values = c("#4daf4a", "#ca0020", "#0571b0")) +
  scale_color_hue(labels = c("Voting", "Pork barrel", "Clientelism"))

print(plot)
dev.off()
